import os
import math
from collections import Counter
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objects as go
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import classification_report, roc_auc_score, confusion_matrix, roc_curve, auc
from sklearn.preprocessing import OneHotEncoder, Normalizer, StandardScaler
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import Lasso, SGDClassifier, LogisticRegression
from sklearn.calibration import CalibratedClassifierCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.model_selection._search import ParameterGrid
from xgboost import XGBClassifier
from scipy.sparse import hstack
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline
from prettytable import PrettyTable
# Results table: one row per (task, encoding, sampling, model) experiment.
summary = PrettyTable()
summary.field_names = ['Task','Encoding','Upsampling','Classifier','+ve class Prec/Rec','Auc ROC']
# BUG FIX: the original line here was `!pip install category_encoders`,
# which is IPython shell magic and a SyntaxError in plain Python.
# Install the dependency outside this script instead:
#   pip install category_encoders
# Load the raw training features (tab separated) and take a first look.
df = pd.read_table('orange_small_train.data')
print(df.shape)
df.head()
df.describe(include='all')
df.dtypes.unique()
df.isnull().sum()

# Per-column count of NaN entries, as a plain list.
missing_vals = list(df.isna().sum())
print('Minimum missing values :', min(missing_vals))
print('Maximum missing values :', max(missing_vals))

# Visualise where the NaNs fall across rows and columns.
plt.figure(figsize=(20,6))
plt.title('Missing Value Heatmap')
plt.xlabel('Features')
plt.ylabel('Indices of Dataframe')
sns.heatmap(df.isnull(), cbar=False)
plt.show()
# This implies that many features have missing values, while only a few features have no or minimal missing values.
# We should do a quantitative analysis and calculate the following for each feature:
# Percentage of missing values per feature.
missing_vals_per = [df[col].isna().sum() * 100 / len(df[col]) for col in df]

# BUG FIX: the original created the Figure without binding it to `fig`,
# so the fig.update_layout(...) call below raised NameError.
# Also generalized: derive the feature count instead of hard-coding 230.
n_features = len(missing_vals_per)
fig = go.Figure(go.Scatter(
    x=np.array(range(n_features)),
    y=missing_vals_per
))
fig.update_layout(title='Feature wise Missing Values Plot',
                  xaxis_title="Features",
                  yaxis_title="Percentage of missing values",
                  xaxis=dict(
                      tickmode='array',
                      tickvals=np.array(range(n_features)),
                      ticktext=df.columns.values
                  )
                  )
fig.show()
# Bucket features by missing-value percentage.  BUG FIX: the original
# conditions (`< 30`, `> 30 and < 90`, `> 90 and != 100`) silently
# dropped features sitting exactly on the 30% or 90% boundaries; these
# buckets now partition [0, 100] exactly.  Also fixed the doubled
# "more more" typo in the third message.
print('Number of features with less than 30 percent missing values (red line) = ', len([i for i in missing_vals_per if i < 30]))
print('Number of features with >= 30 and < 90 percent missing values = ', len([i for i in missing_vals_per if 30 <= i < 90]))
print('Number of features with >= 90 (but not 100) percent missing values = ', len([i for i in missing_vals_per if 90 <= i < 100]))
print('Number of features with 100 percent missing values = ', len([i for i in missing_vals_per if i == 100]))
print('-'*40)
# Sorted curve of missing percentages with the 30% cut-off marked.
plt.plot(sorted(missing_vals_per))
plt.axhline(y=30, color='r', linestyle='-')
plt.xlabel('Features')
plt.ylabel('Percentage of missing values')
plt.title('Missing Values Plot')
plt.grid()
plt.show()
# Binary missing-value indicator frame: one column per original feature
# (suffixed '_indicator'), 1 where the value is NaN, else 0.
# Code reference : https://stackoverflow.com/a/45121967
df_missing_indicator = df.isna().astype(int).rename(columns=lambda name: name + '_indicator')
print(df_missing_indicator.shape)
df_missing_indicator.head()
# For now, we will remove all features with more than 30% missing values and impute the remaining features.
# Drop every feature whose missing-value percentage exceeds the threshold.
threshold = 30
cols_to_drop = [col for col, pct in zip(df.columns.values, missing_vals_per) if pct > threshold]
df_short = df.drop(cols_to_drop, axis=1)
# Free the full frame — only the reduced one is needed from here on.
del df
print(df_short.shape)
df_short.head()
# Count rows that are entirely NaN even after the column pruning.
df_short.isna().all(axis=1).sum()
# Read the three binary target label files (tab separated, one unnamed
# column each), naming the column after its task at read time.
appetency = pd.read_table('orange_small_train_appetency.labels', header=None, names=['appetency'])
churn = pd.read_table('orange_small_train_churn.labels', header=None, names=['churn'])
upselling = pd.read_table('orange_small_train_upselling.labels', header=None, names=['upselling'])

# Stitch features, the three labels, and the missing-value indicators
# into a single working frame.
df_new = pd.concat([df_short, appetency, churn, upselling, df_missing_indicator], axis=1)
print(df_new.shape)
df_new.head()
# Class-balance pie charts for the three tasks, drawn side by side.
labels = '-ve Class', '+ve Class'
explode = (0.1, 0)  # pull the first slice out slightly for emphasis
task_sizes = [
    list(df_new.appetency.value_counts()),
    list(df_new.churn.value_counts()),
    list(df_new.upselling.value_counts()),
]
titles = ['appetency distribution', 'churn distribution', 'upselling distribution']
fig, axes = plt.subplots(1, 3, figsize=(18, 6))
for ax, sizes, title in zip(axes, task_sizes, titles):
    ax.pie(sizes, explode=explode, labels=labels, autopct='%1.1f%%',
           shadow=True, startangle=90)
    ax.axis('equal')  # equal aspect ratio so each pie is a circle
    ax.set_title(title)
# Pairwise scatter/KDE matrix over the remaining columns (only the
# appetency label is dropped; churn/upselling stay in).
# NOTE(review): df_new still holds hundreds of columns at this point, so
# this pairplot is extremely expensive (O(n^2) subplots) and likely
# impractical — consider restricting to a small subset of features.
sns.pairplot(df_new.drop('appetency',axis=1), height=4)
plt.show()